from langchain_community.document_loaders import PyPDFLoader
from langchain_core.documents import Document


# Simple and fast text extraction
async def load(path: str) -> list[Document]:
    """Read the file at *path* with ``PyPDFLoader`` and return its documents.

    Args:
        path: Location of the PDF, passed straight to ``PyPDFLoader``.

    Returns:
        Every ``Document`` yielded by the loader's ``alazy_load()``, in the
        order they were produced.
    """
    # Drain the loader's async iterator into a list in one expression.
    return [document async for document in PyPDFLoader(path).alazy_load()]


async def extract_page_content(docs: list[Document]) -> list[str]:
    """Return the text content of each document as a list.

    Args:
        docs: Documents whose ``page_content`` attribute is read.

    Returns:
        One string per input document, in input order; an empty list when
        *docs* is empty.
    """
    # The result is a list of strings, not a single string, so the return
    # annotation is ``list[str]``.
    return [doc.page_content for doc in docs]


async def extract_content(docs: list[str]) -> str:
    """Combine the per-page text strings into a single string.

    Args:
        docs: Text strings to combine, in order.

    Returns:
        All strings joined with a newline between consecutive items; an
        empty string when *docs* is empty.
    """
    page_separator = "\n"
    return page_separator.join(docs)
